Lab-5.2: Random forests

Author: Brian Kwon

Download: Click here

Instructions

Submission:

Demonstration

library('sparkline')

Classification trees

Setup

library(tidyverse)
library(tree)
library(ISLR)
library(caret)
library(rpart)
library(rattle)
library(visNetwork)
#attach(Carseats)
# Inspect the distribution of Sales to motivate the High/Low cutoff at 8.
hist(Carseats$Sales)

# Derive the binary classification target: High = 1 when Sales > 8, else 0.
Carseats <- Carseats %>% mutate(High = if_else(Sales <= 8, 0, 1) %>% as.factor())

Initial tree

# Fit a full classification tree for High on every predictor except Sales
# (Sales must be dropped: High is derived from it, so it would leak the target).
tree.carseats=tree::tree(High~.,data=Carseats %>% select(-Sales))

summary(tree.carseats)
# Plot the tree skeleton, then annotate splits and leaves with labels.
plot(tree.carseats)
text(tree.carseats,pretty=0) #annotate

Classification tree:
tree::tree(formula = High ~ ., data = Carseats %>% select(-Sales))
Variables actually used in tree construction:
[1] "ShelveLoc"   "Price"       "Income"      "CompPrice"   "Population" 
[6] "Advertising" "Age"         "US"         
Number of terminal nodes:  27 
Residual mean deviance:  0.4575 = 170.7 / 373 
Misclassification error rate: 0.09 = 36 / 400 

Train/test split

# Hold-out evaluation: fit on 200 random rows, test on the remaining 200.
set.seed(2)
train=sample(1:nrow(Carseats), 200)
Carseats.test=Carseats[-train,]
tree.carseats=tree(High~.-Sales,Carseats,subset=train)
tree.pred=predict(tree.carseats,Carseats.test,type="class")
# Test-set performance of the unpruned tree.
confusionMatrix(tree.pred,Carseats.test$High)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 104  33
         1  13  50
                                          
               Accuracy : 0.77            
                 95% CI : (0.7054, 0.8264)
    No Information Rate : 0.585           
    P-Value [Acc > NIR] : 2.938e-08       
                                          
                  Kappa : 0.5091          
                                          
 Mcnemar's Test P-Value : 0.005088        
                                          
            Sensitivity : 0.8889          
            Specificity : 0.6024          
         Pos Pred Value : 0.7591          
         Neg Pred Value : 0.7937          
             Prevalence : 0.5850          
         Detection Rate : 0.5200          
   Detection Prevalence : 0.6850          
      Balanced Accuracy : 0.7456          
                                          
       'Positive' Class : 0               
                                          

Pruning the tree using CV

# Cross-validate over subtree sizes, using misclassification error to prune.
set.seed(3)
cv.carseats=cv.tree(tree.carseats,FUN=prune.misclass)
names(cv.carseats)
cv.carseats
# CV deviance vs tree size, and vs the cost-complexity parameter k.
par(mfrow=c(1,2))
plot(cv.carseats$size,cv.carseats$dev,type="b")
plot(cv.carseats$k,cv.carseats$dev,type="b")
# Prune to 9 terminal nodes and re-evaluate on the held-out set.
prune.carseats=prune.misclass(tree.carseats,best=9)
plot(prune.carseats)
text(prune.carseats,pretty=0)
tree.pred=predict(prune.carseats,Carseats.test,type="class")
confusionMatrix(tree.pred,Carseats.test$High)
  1. 'size'
  2. 'dev'
  3. 'k'
  4. 'method'
$size
[1] 21 19 14  9  8  5  3  2  1

$dev
[1] 74 76 81 81 75 77 78 85 81

$k
[1] -Inf  0.0  1.0  1.4  2.0  3.0  4.0  9.0 18.0

$method
[1] "misclass"

attr(,"class")
[1] "prune"         "tree.sequence"
Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 97 25
         1 20 58
                                          
               Accuracy : 0.775           
                 95% CI : (0.7108, 0.8309)
    No Information Rate : 0.585           
    P-Value [Acc > NIR] : 1.206e-08       
                                          
                  Kappa : 0.5325          
                                          
 Mcnemar's Test P-Value : 0.551           
                                          
            Sensitivity : 0.8291          
            Specificity : 0.6988          
         Pos Pred Value : 0.7951          
         Neg Pred Value : 0.7436          
             Prevalence : 0.5850          
         Detection Rate : 0.4850          
   Detection Prevalence : 0.6100          
      Balanced Accuracy : 0.7639          
                                          
       'Positive' Class : 0               
                                          

Pruning the tree manually

# Prune to a hand-chosen size of 15 leaves (vs the CV-selected 9) and compare.
prune.carseats=prune.misclass(tree.carseats,best=15)
plot(prune.carseats)
text(prune.carseats,pretty=0)
tree.pred=predict(prune.carseats,Carseats.test,type="class")
confusionMatrix(tree.pred,Carseats.test$High)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 102  30
         1  15  53
                                          
               Accuracy : 0.775           
                 95% CI : (0.7108, 0.8309)
    No Information Rate : 0.585           
    P-Value [Acc > NIR] : 1.206e-08       
                                          
                  Kappa : 0.5241          
                                          
 Mcnemar's Test P-Value : 0.03689         
                                          
            Sensitivity : 0.8718          
            Specificity : 0.6386          
         Pos Pred Value : 0.7727          
         Neg Pred Value : 0.7794          
             Prevalence : 0.5850          
         Detection Rate : 0.5100          
   Detection Prevalence : 0.6600          
      Balanced Accuracy : 0.7552          
                                          
       'Positive' Class : 0               
                                          

Fitting Regression Trees

Initial tree - using rpart and rattle / visNetwork

library(MASS)
# Regression tree on the Boston data (target: medv), trained on a random half.
set.seed(1)
train = sample(1:nrow(Boston), nrow(Boston)/2)
tree.boston=tree(medv~.,Boston,subset=train)
summary(tree.boston)
# Same model refit via rpart, for the rattle / visNetwork visualisations.
tree.boston2=rpart(medv~.,Boston,subset=train)
fancyRpartPlot(tree.boston2)
visTree(tree.boston2)

Attaching package: 'MASS'


The following object is masked from 'package:dplyr':

    select


Regression tree:
tree(formula = medv ~ ., data = Boston, subset = train)
Variables actually used in tree construction:
[1] "rm"    "lstat" "crim"  "age"  
Number of terminal nodes:  7 
Residual mean deviance:  10.38 = 2555 / 246 
Distribution of residuals:
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-10.1800  -1.7770  -0.1775   0.0000   1.9230  16.5800 

Pruning tree - using rpart and visNetwork

# Cross-validate subtree size (deviance-based for regression trees).
cv.boston=cv.tree(tree.boston)
plot(cv.boston$size,cv.boston$dev,type='b')
# Prune to 5 leaves for a simpler model.
prune.boston=prune.tree(tree.boston,best=5)
plot(prune.boston)
text(prune.boston,pretty=0)
# Test-set predictions from the UNPRUNED tree; plot vs truth and report MSE.
yhat=predict(tree.boston,newdata=Boston[-train,])
boston.test=Boston[-train,"medv"]
plot(yhat,boston.test)
abline(0,1)
mean((yhat-boston.test)^2)

35.2868818594623

Bagging and Random Forests

Bagging mtry = # variables

library(randomForest)
# Bagging = a random forest with mtry equal to the number of predictors (13),
# so every split considers all variables.
set.seed(1)
bag.boston=randomForest(medv~.,data=Boston,subset=train,mtry=13,importance=TRUE)
bag.boston
# Held-out MSE for the bagged ensemble.
yhat.bag = predict(bag.boston,newdata=Boston[-train,])
plot(yhat.bag, boston.test)
abline(0,1)
mean((yhat.bag-boston.test)^2)
randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Attaching package: 'randomForest'


The following object is masked from 'package:rattle':

    importance


The following object is masked from 'package:dplyr':

    combine


The following object is masked from 'package:ggplot2':

    margin


Call:
 randomForest(formula = medv ~ ., data = Boston, mtry = 13, importance = TRUE,      subset = train) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 13

          Mean of squared residuals: 11.39601
                    % Var explained: 85.17
23.5927297079061

Adjust number of trees

# Same bagging fit but with only 25 trees instead of the default 500.
bag.boston=randomForest(medv~.,data=Boston,subset=train,mtry=13,ntree=25)
yhat.bag = predict(bag.boston,newdata=Boston[-train,])
mean((yhat.bag-boston.test)^2)
23.6671575152042

Random forest mtry < # variables

# Proper random forest: mtry = 6 < 13 decorrelates the individual trees.
set.seed(1)
rf.boston=randomForest(medv~.,data=Boston,subset=train,mtry=6,importance=TRUE)
yhat.rf = predict(rf.boston,newdata=Boston[-train,])
mean((yhat.rf-boston.test)^2)
# Variable importance: %IncMSE (permutation-based) and IncNodePurity.
importance(rf.boston)
varImpPlot(rf.boston)
19.6202073910648
A matrix: 13 × 2 of type dbl
%IncMSE IncNodePurity
crim 16.697017 1076.08786
zn 3.625784 88.35342
indus 4.968621 609.53356
chas 1.061432 52.21793
nox 13.518179 709.87339
rm 32.343305 7857.65451
age 13.272498 612.21424
dis 9.032477 714.94674
rad 2.878434 95.80598
tax 9.118801 364.92479
ptratio 8.467062 823.93341
black 7.579482 275.62272
lstat 27.129817 6027.63740

Boosting

library(gbm)
# Gradient boosting: 5000 depth-4 trees with the default shrinkage.
set.seed(1)
boost.boston=gbm(medv~.,data=Boston[train,],distribution="gaussian",n.trees=5000,interaction.depth=4)
summary(boost.boston)
# Partial-dependence plots for the two most influential variables.
par(mfrow=c(1,2))
plot(boost.boston,i="rm")
plot(boost.boston,i="lstat")
yhat.boost=predict(boost.boston,newdata=Boston[-train,],n.trees=5000)
mean((yhat.boost-boston.test)^2)
# Refit with a larger learning rate (shrinkage = 0.2) and compare test MSE.
boost.boston=gbm(medv~.,data=Boston[train,],distribution="gaussian",n.trees=5000,interaction.depth=4,shrinkage=0.2,verbose=F)
yhat.boost=predict(boost.boston,newdata=Boston[-train,],n.trees=5000)
mean((yhat.boost-boston.test)^2)
Loaded gbm 2.1.8.1
A data.frame: 13 × 2
var rel.inf
<chr> <dbl>
rm rm 43.9919329
lstat lstat 33.1216941
crim crim 4.2604167
dis dis 4.0111090
nox nox 3.4353017
black black 2.8267554
age age 2.6113938
ptratio ptratio 2.5403035
tax tax 1.4565654
indus indus 0.8008740
rad rad 0.6546400
zn zn 0.1446149
chas chas 0.1443986

18.8470923079959
18.3345451839923

Lab Assignment

Problem 1:

Implement decision tree from scratch

  1. Warm up. Write a function called rss() that takes in a vector and returns the residual sum of squares (relative to the mean of the vector).
# Residual sum of squares of a vector about its own mean.
#
# Args:
#   v: numeric vector.
# Returns the scalar sum of squared deviations from mean(v).
rss <- function(v) {
  deviations <- v - mean(v)
  sum(deviations^2)
}
  1. Best Split. We’re next going to write a function best_split() that will form the basis of a decision tree. This function will take in two arguments: a vector predictor variable (ie, x) and a vector target variable (ie, y). This function will identify the single location in the x domain that would yield the best split of the data, such that the two halves now each have least total RSS in the target variable.

Some things to think about: (i) If your input x vector has n data points, how many possible split locations are there? (ii) You can accomplish this task by brute force. For every possible split location, split the data into two parts and compute the new total RSS. Then just return whichever split location was the optimal one.

Make sure your function returns a few things (perhaps in a list): the location of the x split, the mean of y for each of the split parts, the improvement in RSS that was achieved by the split.

# Find the single best binary split of y along x by brute force.
#
# NOTE(review): splits are made by index, so this assumes x is sorted
# ascending; the returned "location" is the index of the last point in the
# left half, not an x value — confirm callers pass sorted data.
#
# Args:
#   x: numeric predictor vector (assumed sorted ascending).
#   y: numeric response vector, same length as x.
# Returns c(split_index, left_mean, right_mean, rss_improvement), where
# rss_improvement = rss(y) - (rss(left) + rss(right)).
best_split <- function(x, y) {
  n <- length(y)
  total_rss <- rss(y)             # loop-invariant: hoisted out of the loop
  improvements <- numeric(n - 1)  # preallocate (was grown with c() per step)
  for (i in seq_len(n - 1)) {
    left_rss <- rss(y[1:i])
    right_rss <- rss(y[(i + 1):n])
    improvements[i] <- total_rss - (left_rss + right_rss)
  }
  idx <- which.max(improvements)
  l_mean <- mean(y[1:idx])
  r_mean <- mean(y[(idx + 1):n])
  c(idx, l_mean, r_mean, improvements[idx])
}
  1. One Dimensional Data Here is a synthetic data set with one predictor and one response. Use your function to find out where the first split would be if \(y\) is predicted from \(x\) with a regression tree.
# Synthetic 1-D data: smooth signal y0 plus unit-variance Gaussian noise.
x = seq(0,10,by = .01)
y0 = cos(x/4 + x^2/5)*x^2/20 + x 
y = y0 + rnorm(length(x))
mydf = data.frame(x=x,y=y)
rss0 = 1000*var(y)
plot(x,y)
# First regression-tree split of y on x using the hand-rolled function.
split0 <- best_split(x,y) 

What is the total RSS of y? What this RSS reduced to when you split the data?

# Compare the total RSS of y with the RSS remaining after the first split
# (split0[4] is the improvement achieved by that split).
cat("Total RSS of y:",rss(y),"\n")
cat("Reduced RSS after split:",rss(y)-split0[4])
Total RSS of y: 11459.58 
Reduced RSS after split: 5021.178
  1. Growing the Tree Split the lower half again. Split the upper half again. What is the total RSS now?
# Recurse one level: split the lower and upper halves at their own best points.
x1 = x[1:split0[1]]
y1 = y[1:split0[1]]
lower = best_split(x1,y1)

x2 = x[(split0[1]+1):length(x)]
y2 = y[(split0[1]+1):length(y)]
upper = best_split(x2,y2)

# Total RSS remaining after all three splits: start from the RSS left after
# the first split and subtract each second-level split's improvement.
# (The previous lower[4]+upper[4] was only the *additional improvement*,
# not the remaining RSS the question asks for.)
cat("Total RSS after splits:", rss(y) - split0[4] - lower[4] - upper[4])
Total RSS after splits: 1651.771
  1. Try this all again with the same data but with R’s tree() function. Make a plot of the tree. Do the first few splits agree with the results of your method?
# Translate split indices back into x locations and report the fitted leaf
# means, for comparison with R's tree() output below.
cat("Initial split location:", x[split0[1]],"\n")
cat("Second split location (lower):", x1[lower[1]],"\n")
cat("Second split location (upper):", x2[upper[1]],"\n")
cat('\n')
cat("Second split lower left mean:", lower[2],"\n")
cat("Second split lower right mean:", lower[3],"\n")
cat("Second split upper left mean:", upper[2],"\n")
cat("Second split upper right mean:", upper[3],"\n")
Initial split location: 4.35 
Second split location (lower): 1.59 
Second split location (upper): 6.77 

Second split lower left mean: 0.9114517 
Second split lower right mean: 2.6857 
Second split upper left mean: 5.375028 
Second split upper right mean: 8.479011 
# Fit the same 1-D regression tree with tree() to cross-check best_split().
tree_model = tree(y ~ x,mydf)

plot(tree_model)
text(tree_model, pretty = 0)

Yes. My best_split() function chooses the same split locations as the tree() function.

  1. Now that we have a good understanding of how to do recursive binary splitting of a single variable, how do would you handle multiple predictors?

I can apply recursive binary splitting to each predictor separately and compare the resulting RSS improvements. The actual split is then made on the predictor that yields the greatest improvement in RSS.

Problem 2:

Complete one of two problems on abalone shells or airfoil data

Your group will be assigned one of the following problems to go over during lab. You will walk through your results at the end of class.

2a

Work with Abalone Data. These data may be found on the UC Irvine website. They give various physical characteristics of about 4000 abalone shellfish. The data were collected in Tasmania in 1995. Make sure that the data are in your source directory.

# Load the abalone data (expects abalone.RData in the working directory).
load("abalone.RData")
head(abalone)
A data.frame: 6 × 9
Sex Length Diameter Height WholeWeight ShuckedWeight VisceraWeight ShellWeight Rings
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 M 0.455 0.365 0.095 0.5140 0.2245 0.1010 0.150 15
2 M 0.350 0.265 0.090 0.2255 0.0995 0.0485 0.070 7
3 F 0.530 0.420 0.135 0.6770 0.2565 0.1415 0.210 9
4 M 0.440 0.365 0.125 0.5160 0.2155 0.1140 0.155 10
5 I 0.330 0.255 0.080 0.2050 0.0895 0.0395 0.055 7
6 I 0.425 0.300 0.095 0.3515 0.1410 0.0775 0.120 8
  1. We’ll try to predict the number of Rings, using the other features. Train a linear model as a baseline case. What is the RSS or Rsquared?
# Baseline linear model: predict Rings from all other features.
bm = lm(Rings ~ ., data = abalone)
summary(bm)
# In-sample RSS and R-squared for comparison with the tree models.
cat("RSS:",sum(bm$residuals**2),"\n")
cat("Rsquared:",summary(bm)$r.squared)

Call:
lm(formula = Rings ~ ., data = abalone)

Residuals:
     Min       1Q   Median       3Q      Max 
-10.4800  -1.3053  -0.3428   0.8600  13.9426 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)     3.89464    0.29157  13.358  < 2e-16 ***
SexI           -0.82488    0.10240  -8.056 1.02e-15 ***
SexM            0.05772    0.08335   0.692    0.489    
Length         -0.45834    1.80912  -0.253    0.800    
Diameter       11.07510    2.22728   4.972 6.88e-07 ***
Height         10.76154    1.53620   7.005 2.86e-12 ***
WholeWeight     8.97544    0.72540  12.373  < 2e-16 ***
ShuckedWeight -19.78687    0.81735 -24.209  < 2e-16 ***
VisceraWeight -10.58183    1.29375  -8.179 3.76e-16 ***
ShellWeight     8.74181    1.12473   7.772 9.64e-15 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.194 on 4167 degrees of freedom
Multiple R-squared:  0.5379,    Adjusted R-squared:  0.5369 
F-statistic: 538.9 on 9 and 4167 DF,  p-value: < 2.2e-16
RSS: 20060.73 
Rsquared: 0.5378844
  1. Now build a tree and plot it. We can make the annotation of the tree look better by reducing the font size with the parameter. What is the depth of the tree? How many leaves does it have?
# Regression tree for Rings; shrink label font so annotations stay readable.
tm = tree(Rings~.,abalone)
plot(tm,cex=0.5)
text(tm,pretty=0,cex=0.5)
summary(tm)

Regression tree:
tree(formula = Rings ~ ., data = abalone)
Variables actually used in tree construction:
[1] "ShellWeight"   "Sex"           "ShuckedWeight"
Number of terminal nodes:  10 
Residual mean deviance:  5.443 = 22680 / 4167 
Distribution of residuals:
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-6.8820 -1.5540 -0.5536  0.0000  0.9490 17.3000 

There are 10 leaves and the depth of the tree is 4.

  1. We can manually prune the tree to whatever depth we want. Use the function prune.tree() to simplify the tree so it only has 4 leaves. Visualize this tree.
# Prune the abalone tree down to 4 terminal nodes and visualise it.
ptm = prune.tree(tm,best=4)
plot(ptm)
text(ptm,pretty=0)

  1. Which two continuous predictors seem to be highly predictive according to the tree? Draw a sketch of the feature space and the splits in the space, as well as the predicted number of Rings for each region.

ShellWeight and Sex seems to be highly predictive according to the tree.

  1. Decision trees have high variance. Split the Abalone data in half and train two trees (and don’t worry about any extra pruning). Observe the differences between them, visualize the two different trees.
# Variance check: train separate trees on the first and second halves of the
# data and compare their structure side by side.
# NOTE(review): this is a deterministic first/second-half split, not a random
# one — fine only if row order is arbitrary; confirm the data are not sorted.
df1 = abalone[1:(nrow(abalone)/2),]
df2 = abalone[(nrow(abalone)/2+1):nrow(abalone),]
tm1 = tree(Rings ~ ., data = df1)
tm2 = tree(Rings ~ ., data = df2)

par(mfrow = c(1, 2))
plot(tm1)
text(tm1, pretty = 0, cex = 0.5)
plot(tm2)
text(tm2, pretty = 0, cex = 0.5)

The two trees are quite similar to each other, with similar split locations and predictors.

2b

Let’s explore the airfoil data. We’re going to try to predict the “Pressure” feature from the other features. Use one of the tree aggregation methods we have learned about (Random Forest, Boosting, Bagging)

# Load the airfoil data (expects airfoil.RData in the working directory).
load("airfoil.RData")
head(airfoil)
A data.frame: 6 × 6
Frequency AngleOfAttack ChordLength FreeStreamVelocity Displacement Pressure
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 800 0 0.3048 71.3 0.00266337 126.201
2 1000 0 0.3048 71.3 0.00266337 125.201
3 1250 0 0.3048 71.3 0.00266337 125.951
4 1600 0 0.3048 71.3 0.00266337 127.591
5 2000 0 0.3048 71.3 0.00266337 127.461
6 2500 0 0.3048 71.3 0.00266337 125.571
  1. Start with some exploratory visualizations to see how the other features are related to Pressure. Feel free to use pairs(), or scatterplots, or boxplots. Do any features seem to be strongly predictive of Pressure?

  2. Create a train-test split of the data.

# Train/test split: 200 random rows for training, the rest held out.
# NOTE(review): no set.seed() here, so downstream results change between runs.
train=sample(1:nrow(airfoil), 200)
airfoil.test = airfoil[-train,]
  1. Fit a linear model, observe the \(R^2\) on the test set.
# Linear baseline for Pressure on all other features.
model = lm(Pressure ~ .,data=airfoil,subset=train)
pred = predict(model,airfoil.test)
summary(model)

Call:
lm(formula = Pressure ~ ., data = airfoil, subset = train)

Residuals:
     Min       1Q   Median       3Q      Max 
-16.2190  -3.0333  -0.0972   2.3987  13.6883 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         1.342e+02  1.593e+00  84.233  < 2e-16 ***
Frequency          -1.585e-03  1.379e-04 -11.495  < 2e-16 ***
AngleOfAttack      -2.758e-01  1.153e-01  -2.392  0.01770 *  
ChordLength        -3.346e+01  4.896e+00  -6.835 1.04e-10 ***
FreeStreamVelocity  7.450e-02  2.284e-02   3.261  0.00131 ** 
Displacement       -2.120e+02  4.090e+01  -5.182 5.48e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.88 on 194 degrees of freedom
Multiple R-squared:  0.555, Adjusted R-squared:  0.5435 
F-statistic: 48.39 on 5 and 194 DF,  p-value: < 2.2e-16
  1. Fit a single decision tree, one with large depth. Observe the \(R^2\) on the test set. Make a plot of three if you wish.
# Single regression tree; report its held-out MSE.
tree.a = tree(Pressure~.,airfoil,subset=train)
# tree.pred = predict(tree.a,newdata = airfoil.test)
tree.pred = predict(tree.a,newdata = airfoil[-train,])
mean((tree.pred - airfoil.test$Pressure)^2)
summary(tree.a)
30.5232627779195

Regression tree:
tree(formula = Pressure ~ ., data = airfoil, subset = train)
Variables actually used in tree construction:
[1] "Frequency"     "Displacement"  "AngleOfAttack" "ChordLength"  
Number of terminal nodes:  16 
Residual mean deviance:  15.51 = 2853 / 184 
Distribution of residuals:
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-14.2400  -2.4920   0.2053   0.0000   2.5630  11.1400 
  1. Fit a tree aggregation model, observe the \(R^2\) on the test set.
# Tree-aggregation model (random forest): held-out MSE plus variable
# importance (IncNodePurity only, since importance=TRUE was not requested).
rf.a = randomForest(Pressure~.,airfoil,subset=train)
rf.pred = predict(rf.a,newdata = airfoil.test)
mean((rf.pred - airfoil.test$Pressure)^2)
importance(rf.a)
varImpPlot(rf.a)
20.7396488641929
A matrix: 5 × 1 of type dbl
IncNodePurity
Frequency 2992.9822
AngleOfAttack 1284.8278
ChordLength 1017.7748
FreeStreamVelocity 515.1723
Displacement 2122.8262